Data Visualization#

Once upon a time there were plots upon plots upon plots.

Load data#

Hide code cell source
import pandas as pd

# It is right in front of you. You just need to open your eyes:
df = pd.read_excel('../data/al_atlas_main_results.xlsx', index_col=0).sort_index()

Main Figures#

Bokeh#

import sys
sys.path.append('../')
from source.bokeh_plots import *

plot_bokeh(df)
Loading BokehJS ...

Discovery Figures#

Kaplan-Meier Plots#

Overall study population#

df_train = df[df['Train-Test'] == 'Train Sample']
df_test = df[df['Train-Test'] == 'Test Sample']

df_train = df_train[df_train['Clinical Trial'].isin(['AAML0531','AAML1031',
                                                     'AAML03P1','CCG2961'])]
### Select samples from AAML1031, 0531, and 03P1 clinical trials
df1 = df[df['Clinical Trial'].isin(['AAML0531', 'AAML1031', 'AAML03P1'])]

print(
    f'{df.shape[0]-df1.shape[0]} samples were removed. {df1.shape[0]} samples remaining.')

### Select diagnostic samples only
df2 = df1[df1['Sample Type'].isin(
    ['Diagnosis', 'Primary Blood Derived Cancer - Bone Marrow', 'Primary Blood Derived Cancer - Peripheral Blood'])]

print(
    f'{df1.shape[0]-df2.shape[0]} samples were removed. {df2.shape[0]} samples remaining.')

### Remove duplicate samples
df_cog = df2[~df2['Patient_ID'].duplicated(keep='last')]

print(
    f'{df2.shape[0]-df_cog.shape[0]} samples were removed. {df_cog.shape[0]} samples remaining.')
2228 samples were removed. 1281 samples remaining.
332 samples were removed. 949 samples remaining.
9 samples were removed. 940 samples remaining.
Hide code cell source
# Import Plotting Functions
from source.data_visualization_functions import *

model_name = 'AML Epigenomic Risk'
for dataset, trial in zip([df_cog, df_test], 
                          ['COG AML Trials', 'St. Jude AML Trials']):
    draw_kaplan_meier(model_name=model_name,
                        df=dataset,
                        save_survival_table=False,
                        save_plot=False,
                        show_ci=False,
                        add_risk_counts=False,
                        trialname=trial)
../_images/7a9772699e20792c8ba5261bc85fa050ec4d1da5f6c041058587f58ce80bda0c.png ../_images/6abe7b4c02465a7e4b390f680cdf1d77cf048d466135fa893754a608058fbf75.png

Per risk group#

Hide code cell source
for dataset, trial in zip([df_cog, df_test], 
                          ['COG AML Trials', 'St. Jude AML Trials']):
    draw_kaplan_meier(model_name=model_name,
                            df=dataset[dataset['Risk Group'] == 'High Risk'],
                            save_plot=False,
                            save_survival_table=False,
                            add_risk_counts=False,
                            trialname=trial + ' High Risk')

    draw_kaplan_meier(model_name=model_name,
                            df=dataset[dataset['Risk Group'] == 'Low Risk'],
                            save_plot=False,
                            save_survival_table=False,
                            add_risk_counts=False,
                            trialname=trial + ' Low Risk')

    draw_kaplan_meier(model_name=model_name,
                            df=dataset[dataset['Risk Group'] == 'Standard Risk'],
                            save_plot=False,
                            save_survival_table=False,
                            add_risk_counts=False,
                            trialname=trial + ' Standard Risk')
../_images/4d6c9a4a1e9342c1e3a29e06a725b9b38bbb7b419c752e095a40d44b4eb62a33.png ../_images/dd2d7a7eac3595bf743258a6e96282605faebc00496a42ae7d2445f26cd0e1b8.png ../_images/4878d9ce6a063a818cbc40aba94d5c9d12574b5d6f4152eb13f05219eea1912f.png ../_images/bc07c6e426ee200ead01b7e7a2e517183351d84a701c57b441e48e906e4285e8.png ../_images/f0c7bb39555a94d51bcd9f435df3166fa5c50fd8ee4fc322c8a9936a338384c3.png ../_images/e2e05eb61ce98254040f132b602feb279fd12e4b9dfadc19a3846a0feb7e926c.png
for dataset, trial in zip([df_cog], 
                          ['COG AML Trials']):
    draw_kaplan_meier(model_name=model_name,
                            df=dataset[dataset['Risk Group AAML1831'] == 'High'],
                            save_plot=False,
                            save_survival_table=False,
                            add_risk_counts=False,
                            trialname=trial + ' High Risk')

    draw_kaplan_meier(model_name=model_name,
                            df=dataset[dataset['Risk Group AAML1831'] == 'Low'],
                            save_plot=False,
                            save_survival_table=False,
                            add_risk_counts=False,
                            trialname=trial + ' Low Risk')

    draw_kaplan_meier(model_name=model_name,
                            df=dataset[dataset['Risk Group AAML1831'] == 'Standard'],
                            save_plot=False,
                            save_survival_table=False,
                            add_risk_counts=False,
                            trialname=trial + ' Standard Risk')
../_images/fcec0f2a0e49e7203ccec569e9d4b2871d34c6ac8442f15f1fd84320aa5909cd.png ../_images/6541d76ee9ef6aeb86bad83c3996cb84d556047e3733e1dd0b725d2f2c4b0e5b.png ../_images/cfefbbd4c9f1c5fe314b09356b9de07c18967d872692130cd5398f36d842d2b5.png

Forest Plots#

With MRD 1#

Hide code cell source
df_test['AML_Epigenomic_Risk'] = df_test['AML Epigenomic Risk'] 

def draw_forest_plot(time, event, df, save_plot=False, trialname=None, scorename=None):
    """
    Generates a custom forest plot.

    Parameters:
    ----------
    time: object
        List of mean coeficients from CoxPH fit.
        Note: this value has to be a pandas series.
    event: object
        Dataframe to add your results to.
    df: object
        A dataframe of variables/features that will be used to calculate the score.
    save_plot: bool, default=False
        Set to True if you wish to save the plot.It will be saved under "../Figures/ForestPlot/"
    trialname: str
        Name of your clinical trial or dataset.
    scorename: str
        Name of your model.

    Returns:
    --------
        A magnificent forest plot.

    """
    import myforestplot as mfp
    from tableone import TableOne
    import statsmodels.formula.api as smf
    import numpy as np
    
    fp = df[[scorename,
             'MRD 1 Status',
             'Risk Group',
             'FLT3 ITD',
             'Leucocyte counts (10⁹/L)',
             'Age group (years)',
             time, event]]

    event2 = event.replace('.', '_')
    time2 = time.replace('.', '_')

    if event[0] == 'o':
        event3 = 'OS'
    else:
        event3 = 'EFS'

    fp2 = fp.rename(columns={event: event2,
                             time: time2,
                             'MRD 1 Status': 'MRD_1_Status',
                             'FLT3 ITD': 'FLT3_ITD',
                             'Risk Group': 'Risk_Group',
                             'Leucocyte counts (10⁹/L)': 'WBC_count',
                             'Age group (years)': 'Age_group'})

    res = smf.phreg(formula=time2 + " ~ C("+scorename+",Treatment(reference='Low')) + C(MRD_1_Status) + C(Risk_Group,Treatment(reference='Low Risk')) + C(FLT3_ITD) + C(WBC_count) + C(Age_group)",
                    data=fp2, status=event2).fit()

    res2 = res.summary(xname=[scorename+'-High',
                              'MRD 1 Status-Positive',
                              'Risk Group-High Risk',
                              'Risk Group-Standard Risk',
                              'FLT3 ITD-Yes',
                              'Leucocyte counts (10⁹/L)-≥30',
                              'Age group (years)-≥10']).tables[1]

    res3 = res2.set_index(res2.index.str.split(pat='-', expand=True))

    mytable = TableOne(data=fp.drop(columns=[event, time]),
                       pval=False, missing=True, overall=True,
                       label_suffix=False, order={scorename: ['High'],
                                                  'MRD 1 Status': ['Positive'],
                                                  'Risk Group': ['High Risk', 'Standard Risk'],
                                                  'FLT3 ITD': ['Yes'],
                                                  'Leucocyte counts (10⁹/L)': ['≥30'],
                                                  'Age group (years)': ['≥10']}).tableone

    mytable2 = mytable.join(res3)

    mytable2["risk_pretty"] = mfp.add_pretty_risk_column(mytable2,
                                                         risk="HR",
                                                         lower='[0.025',
                                                         upper='0.975]',
                                                         fml=".2f"
                                                         )
    mytable3 = mytable2.reset_index(names=['category', 'item']).rename(columns={'HR': 'risk',
                                                                                '[0.025': 0,
                                                                                '0.975]': 1}).iloc[1:, :]

    mytable3['P>|t|'] = round(mytable3['P>|t|'], 4).replace(
        {np.nan: '', 0: '<0.0001'})

    plt.rcParams["font.size"] = 8
    fp = mfp.ForestPlot(df=mytable3,
                        ratio=[3, 3, 2],
                        fig_ax_index=[2],
                        dpi=300,
                        figsize=(9, 5),
                        vertical_align=True)
    fp.errorbar(index=2, errorbar_kwds=None)
    fp.axd[2].set_xlim([1, 8.5])
    fp.axd[2].set_xticks([0, 2, 4, 6, 8])
    fp.axd[2].set_xticklabels(labels=[0, 2, 4, 6, 8], fontdict={'fontsize': 8})
    fp.axd[2].set_xlabel("Hazard Ratio", fontsize=8)
    fp.axd[2].axvline(x=1, ymin=0, ymax=1.0, color="black", alpha=0.5)

    fp.axd[1].set_xlim([0.50, 1])
    fp.embed_cate_strings(1, "category", 0.5, header=trialname + " " + event3,
                          text_kwds=dict(fontweight="bold"),
                          header_kwds=dict(fontweight="bold"),
                          )
    fp.embed_strings(1, "item", 0.55, header="", replace={"age": ""})
    fp.embed_strings(1, "Overall", 0.86, header="n (%)")
    fp.embed_strings(3, "P>|t|", 0, header="P>|t|")
    fp.embed_strings(3, "risk_pretty", 0.4, header="Hazard Ratio (95% CI)")
    fp.horizontal_variable_separators()
    fp.draw_outer_marker(log_scale=False, scale=0.008, index=2)

    # Save plot figure
    if save_plot == True:
        plt.savefig('../Figures/Forest_Plots/' + scorename + '_' + trialname + '_' + str(len(df)) + '_' + event3 + '.png',
                    bbox_inches='tight', dpi=300)

    return (plt.show())


draw_forest_plot(time='os.time',
                    event='os.evnt',
                    df=df_test,
                    trialname='StJude trials:',
                    scorename='AML_Epigenomic_Risk',
                    save_plot=False)

draw_forest_plot(time='efs.time',
                    event='efs.evnt',
                    df=df_test,
                    trialname='StJude trials:',
                    scorename='AML_Epigenomic_Risk',
                    save_plot=False)
../_images/6b70d97c70bc9780079d8427593293cf40c15d8f780ffe878170bdecd71c5b71.png ../_images/f450d60de119be032a11158cbb0e689fa467e0d21d89bb2785b1b0ea21f5f54a.png

Without MRD 1#

Hide code cell source
# draw_forest_plot_noMRD(time='os.time',
#                     event='os.evnt',
#                     df=df_test,
#                     trialname='StJude trials:',
#                     scorename='AML_Epigenomic_Risk',
#                     save_plot=False)

# draw_forest_plot_noMRD(time='efs.time',
#                     event='efs.evnt',
#                     df=df_test,
#                     trialname='StJude trials:',
#                     scorename='AML_Epigenomic_Risk',
#                     save_plot=False)

ROC AUC#

Hide code cell source
# Your current preprocessing
df_test['Risk Group bins'] = df_test['Risk Group'].replace({'Low Risk':0, 'Standard Risk':0.5, 'High Risk':1})
df_test['MRD 1 bins'] = df_test['MRD 1 Status'].replace({'Negative':0, 'Positive':1})
df_test2 = df_test[['os.evnt', model_name + '_int', 'Risk Group bins', 'MRD 1 bins']].dropna()

# rename column `MethylScoreAML_Px_cat_bin` to `MethylScoreAML Px`
df_test2 = df_test2.rename(columns={model_name + '_int':model_name})

# Add new columns based on standardized values
df_test2['MRD1 + Risk Group'] = df_test['MRD 1 bins'] + df_test['Risk Group bins']
df_test2['MRD1 + Risk Group + MethylScore'] = df_test['MRD 1 bins'] + df_test['Risk Group bins'] + df_test[model_name + '_int']


import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer

def plot_roc_auc(df, score_columns, outcome_column, trial_name='validation cohort'):
    """
    Plots the ROC AUC curves for multiple models given a dataframe and multiple score columns.
    
    Parameters:
    - df (pd.DataFrame): Dataframe containing the score and outcome columns.
    - score_columns (list of str): List of names of columns that contain the scores.
    - outcome_column (str): The name of the column that contains the true outcomes.
    
    Returns:
    None
    """
    
    plt.figure()
    plt.title('ROC AUC in ' + trial_name + ', n={}'.format(len(df)))
    
    # plot random guessing line
    plt.plot([0, 1], [0, 1], 'r--')

    # binarize the outcome variable
    lb = LabelBinarizer()
    lb.fit(df[outcome_column])
    y = lb.transform(df[outcome_column])
    
    # Loop over score_columns to plot multiple ROC curves
    for score_column in score_columns:
        
        # calculate the fpr and tpr for all thresholds of the classification
        fpr, tpr, threshold = roc_curve(y, df[score_column])
        roc_auc = auc(fpr, tpr)
        
        # plot ROC curve for this score_column
        plt.plot(fpr, tpr, label=f'{score_column} AUC = %0.2f' % roc_auc)

    # set x and y limits
    plt.xlim([0, 1])
    plt.ylim([0, 1])

    # set x and y labels
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')

    # add legend
    plt.legend(loc='lower right')
    plt.show()

# Example Usage:
score_columns = [model_name, 'Risk Group bins', 'MRD 1 bins', 'MRD1 + Risk Group', 'MRD1 + Risk Group + MethylScore']
outcome_column = 'os.evnt'
plot_roc_auc(df_test2, score_columns, outcome_column)

Box Plots#

Hide code cell source
draw_boxplot(df=df_test,x='Risk Group', y='P(Dead)',
                order=['High Risk', 'Standard Risk', 'Low Risk'],
                trialname='StJude trials', hue=model_name,
                save_plot=False, figsize=None)

draw_boxplot(df=df_test,x='MRD 1 Status', y='P(Dead)',
                order=['Positive','Negative'],
                trialname='StJude trials', hue=model_name,
                save_plot=False, figsize=None)

draw_boxplot(df=df_test,x='Primary Cytogenetic Code', y='P(Dead)',
                order='auto',
                trialname='StJude trials', hue=model_name,
                save_plot=False, figsize=None)

Stacked Bar Plots#

Hide code cell source
draw_stacked_barplot(df=validation_clinical_data,x='MRD 1 Status', y=score_name,
             order=['Positive','Negative'],
             trialname='StJude trials', hue=score_name + ' Categorical',
             save_plot=False, figsize=None)

draw_stacked_barplot(df=validation_clinical_data,x='Risk Group', y=score_name,
                order=['High Risk', 'Standard Risk', 'Low Risk'],
                trialname='StJude trials', hue=score_name + ' Categorical',
                save_plot=False, figsize=None, fontsize=9)

draw_stacked_barplot(df=validation_clinical_data,x='Primary Cytogenetic Code', y=score_name,
                order='auto',
                trialname='StJude trials', hue=score_name + ' Categorical',
                save_plot=False, figsize=None, fontsize=6)

Patient Characteristics Table#

Overall study population#

Hide code cell source
from tableone import TableOne

columns = ['Age (years)','Age group (years)','Sex','Race or ethnic group',
            'Hispanic or Latino ethnic group', 'MRD 1 Status',
            'Leucocyte counts (10⁹/L)', 'BM Leukemic blasts (%)',
            'Risk Group', 'Clinical Trial','FLT3 ITD','Treatment Arm']

validation_clinical_data['Age (years)'] = validation_clinical_data['Age (years)'].astype(float)

mytable_cog = TableOne(validation_clinical_data, columns,
                        overall=False, missing=True,
                        pval=False, pval_adjust=False,
                        htest_name=True,dip_test=True,
                        tukey_test=True, normal_test=True,

                        order={'FLT3 ITD':['Yes','No'],
                                'Race or ethnic group':['White','Black or African American','Asian'],
                                'MRD 1 Status': ['Positive'],
                                'Risk Group': ['High Risk', 'Standard Risk'],
                                'FLT3 ITD': ['Yes'],
                                'Leucocyte counts (10⁹/L)': ['≥30'],
                                'Age group (years)': ['≥10']})

mytable_cog.to_csv(output_path + 'multivariate_cox_lasso/tableone_validation_cohort.csv')

mytable_cog.tabulate(tablefmt="html", 
                        headers=[score_name,"",'Missing','Validation Cohort'])

Including both discovery and validation cohorts#

Hide code cell source
# Load clinical data
discovery_clinical_data = pd.read_csv(input_path+'discovery_clinical_data.csv',
                                      low_memory=False, index_col=0)

discovery_clinical_data['Age (years)'] = discovery_clinical_data['Age (years)'].astype(float)

px = discovery_clinical_data.loc[ewas_top_cpgs.index]

dx = discovery_clinical_data
# [~discovery_clinical_data['ELN 2022 Diagnosis'].isin(['Mixed phenotype acute leukemia T/myeloid',
#                                        'Myeloid leukaemia associated with Down syndrome',
#                                        'AML with t(9;22)(q34.1;q11.2)/BCR::ABL1'])]
dx = dx[~dx['WHO 2022 Diagnosis'].isna()]

# Use only samples from df_index
dx = dx[dx.index.isin(pd.read_csv(output_path+'pacmap_output/pacmap_5d_output_acute_leukemia.csv', index_col=1).index)]

# join discovery clinical data with validation clinical data
all_cohorts = pd.concat([dx, px, validation_clinical_data],
                         axis=0, keys=['MethylScoreAML Dx Discovery','MethylScoreAML Px Discovery' ,'Validation'],
                         names=['cohort']).reset_index()


columns = ['Age group (years)','Sex', 'MRD 1 Status',
            'Leucocyte counts (10⁹/L)',
            'Risk Group','FLT3 ITD', 'Treatment Arm','Clinical Trial']

mytable_cog = TableOne(all_cohorts, columns,
                        overall=False, missing=False,
                        pval=False, pval_adjust=False,
                        htest_name=True,dip_test=True,
                        tukey_test=True, normal_test=True,

                        order={'FLT3 ITD':['Yes','No'],
                                'Race or ethnic group':['White','Black or African American','Asian'],
                                'MRD 1 Status': ['Positive'],
                                'Risk Group': ['High Risk', 'Standard Risk'],
                                'FLT3 ITD': ['Yes'],
                                'Leucocyte counts (10⁹/L)': ['≥30'],
                                'Age group (years)': ['≥10']},
                                groupby='cohort')

mytable_cog.to_excel('../data/tableone_both_cohorts.xlsx')

mytable_cog.tabulate(tablefmt="html", 
                        # headers=[score_name,"",score_name,'Validation','p-value','Statistical Test']
)

By MethylScore category#

Hide code cell source
from tableone import TableOne

columns = ['Age (years)','Age group (years)','Sex','Race or ethnic group',
            'Hispanic or Latino ethnic group', 'MRD 1 Status',
            'Leucocyte counts (10⁹/L)', 'BM Leukemic blasts (%)',
            'Risk Group', 'Clinical Trial','FLT3 ITD']

validation_clinical_data['Age (years)'] = validation_clinical_data['Age (years)'].astype(float)

mytable_cog = TableOne(validation_clinical_data, columns,
                        overall=False, missing=True,
                        pval=True, pval_adjust=False,
                        htest_name=True,dip_test=True,
                        tukey_test=True, normal_test=True,

                        order={'FLT3 ITD':['Yes','No'],
                                'Race or ethnic group':['White','Black or African American','Asian'],
                                'MRD 1 Status': ['Positive'],
                                'Risk Group': ['High Risk', 'Standard Risk'],
                                'FLT3 ITD': ['Yes'],
                                'Leucocyte counts (10⁹/L)': ['≥30'],
                                'Age group (years)': ['≥10']},
                        groupby=score_name + ' Categorical')

mytable_cog.to_csv(output_path + 'multivariate_cox_lasso/tableone_validation_methylscoreaml_px.csv')
mytable_cog.to_excel('../data/tableone_validation_methylscoreaml_px.xlsx')

mytable_cog.tabulate(tablefmt="html", 
                        headers=[score_name,"",'Missing','High','Low','p-value','Statistical Test'])

Watermark#